library(tidyverse)
library(dplyr)
library(tidyr)
library(lubridate)
library(forecast)
library(tseries)
library(modeltime)
library(tidymodels)
library(prophet)
library(timetk)
setwd("~/Desktop/Data Science Projects/Oil & Gas Forecasting")
data <- read.csv("well_data.csv")series <- data %>%
select(period, well_name) |>
mutate(value = 1) %>%
pivot_wider(names_from = well_name, values_from = value, values_fill = list(value = 0)) %>%
arrange(period)
series |>
slice_head(n=20)## # A tibble: 20 × 60
## period FIELD11A FIELD12 FIELD13 FIELD14 FIELD15A FIELD16 FIELD17 FIELD178
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1980-07-01 0 0 0 0 0 0 0 0
## 2 1980-08-01 0 0 0 0 0 0 0 0
## 3 1980-09-01 0 0 0 0 0 0 0 0
## 4 1980-10-01 0 0 0 0 0 0 0 0
## 5 1980-11-01 0 0 0 0 0 0 0 0
## 6 1980-12-01 0 0 0 0 0 0 0 0
## 7 1981-02-01 0 0 0 0 0 0 0 0
## 8 1981-03-01 0 0 0 0 0 0 0 0
## 9 1981-04-01 0 0 0 0 0 0 0 0
## 10 1981-05-01 0 0 0 0 0 0 0 0
## 11 1981-06-01 0 0 0 0 0 0 0 0
## 12 1981-07-01 0 0 0 0 0 0 0 0
## 13 1981-08-01 0 0 0 0 0 0 0 0
## 14 1981-09-01 0 0 0 0 0 0 0 0
## 15 1981-10-01 0 0 0 0 0 0 0 0
## 16 1981-11-01 0 0 0 0 0 0 0 0
## 17 1981-12-01 0 0 0 0 0 0 0 0
## 18 1982-01-01 0 0 0 0 0 0 0 0
## 19 1982-02-01 0 0 0 0 0 0 0 0
## 20 1982-03-01 0 0 0 0 0 0 0 0
## # ℹ 51 more variables: FIELD19 <dbl>, FIELD198 <dbl>, FIELD1B <dbl>,
## # FIELD20 <dbl>, FIELD21 <dbl>, FIELD211 <dbl>, FIELD216 <dbl>,
## # FIELD221 <dbl>, FIELD223 <dbl>, FIELD282 <dbl>, FIELD31 <dbl>,
## # FIELD32 <dbl>, FIELD34A <dbl>, FIELD35 <dbl>, FIELD37 <dbl>,
## # FIELD39A <dbl>, FIELD4 <dbl>, FIELD43 <dbl>, FIELD5 <dbl>, FIELD51 <dbl>,
## # FIELD51A <dbl>, FIELD52 <dbl>, FIELD53 <dbl>, FIELD53A <dbl>,
## # FIELD55 <dbl>, FIELD55D <dbl>, FIELD56 <dbl>, FIELD57 <dbl>, …
well_characteristics <- data %>%
rename(date = period) |>
group_by(well_name) %>%
summarise(
Avg_Gas_to_Oil_Ratio = mean(gas_total / oil, na.rm = TRUE),
Months_of_Production = n(),
Initial_Production_Date = min(date),
Avg_Monthly_Gas_Decline_Rate = {
gas_diff <- diff(gas_total)
gas_lag <- lag(gas_total, default = first(gas_total))[-1]
rate <- gas_diff / gas_lag
rate[is.infinite(rate)] <- NA
mean(rate, na.rm = TRUE)
}
) %>%
ungroup()
print(well_characteristics)## # A tibble: 59 × 5
## well_name Avg_Gas_to_Oil_Ratio Months_of_Production Initial_Production_Date
## <chr> <dbl> <int> <chr>
## 1 FIELD11A 23.7 91 1985-03-01
## 2 FIELD12 23.2 87 1982-11-01
## 3 FIELD13 29.7 140 1986-09-01
## 4 FIELD14 23.9 80 1986-07-01
## 5 FIELD15A 23.4 119 1988-02-01
## 6 FIELD16 23.3 4 1987-06-01
## 7 FIELD17 24.3 89 1990-04-01
## 8 FIELD178 22.8 11 1986-05-01
## 9 FIELD19 23.5 35 1990-08-01
## 10 FIELD198 23.7 31 1986-11-01
## # ℹ 49 more rows
## # ℹ 1 more variable: Avg_Monthly_Gas_Decline_Rate <dbl>
wells_ge_24 <- well_characteristics |>
filter(Months_of_Production >= 24)
cleaned_data <- data |>
filter(well_name %in% wells_ge_24$well_name & oil != 0 & gas_total != 0) |>
rename(date = period)Combine data for all Wells into a single aggregated dataframe
data_aggregated <- cleaned_data %>%
mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
group_by(date) %>%
summarise(total_oil = sum(oil, na.rm = TRUE),
total_gas = sum(gas_total, na.rm = TRUE)) |>
arrange(date)
head(data_aggregated)## # A tibble: 6 × 3
## date total_oil total_gas
## <date> <dbl> <dbl>
## 1 1980-07-01 2136. 51426.
## 2 1980-08-01 8435. 228568.
## 3 1980-09-01 11560. 312667.
## 4 1980-10-01 1075. 10971.
## 5 1980-11-01 1954. 36393.
## 6 1980-12-01 7270. 122097.
Plot the full time series to observe the behavior of the data.
ggplot(data_aggregated, aes(x = date, y = total_oil)) +
geom_line(color = "blue") +
labs(title = "Oil Production Over Time",
x = "Month",
y = "barrels/d") +
theme_minimal()
The data appears to display high volatility between 1980 and 2010 with the trend showing stabilization after 2010. Due to this, I made the decision to remove the data prior to 2010 and focus on 2010-2018 data for forecasting. Given that the goal is to forecast the next 6 months, the last 8 years of data seems to be more applicable.
data_subset <- data_aggregated |>
filter(date >= as.Date("2010-01-01"))
data_subset_oil <- data_subset |>
select(date, total_oil)
data_subset_oil |> plot_time_series(date, total_oil)Looking at the plot above, we can see that there may be an outlier between 2015 and 2016. I’ll go ahead and include a step for handling outliers.
Q1 <- quantile(data_subset_oil$total_oil, 0.25)
Q3 <- quantile(data_subset_oil$total_oil, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- which(data_subset_oil$total_oil < lower_bound | data_subset_oil$total_oil > upper_bound)
median_value <- median(data_subset_oil$total_oil, na.rm = TRUE)
data_subset_oil$total_oil[outliers] <- median_valueLet’s take a look at the data after handling outliers
I’ll use the ADF test to test for stationarity
strtyr <- min(year(data_subset_oil$date))
ts_data <- ts(data_subset_oil$total_oil,
frequency = 12,
start = c(strtyr, 1))
adf_oil <- adf.test(ts_data, alternative = "stationary")
print(adf_oil)##
## Augmented Dickey-Fuller Test
##
## data: ts_data
## Dickey-Fuller = -1.7646, Lag order = 4, p-value = 0.6743
## alternative hypothesis: stationary
With a p-value of 0.6743, the results show no-stationarity. I’ll try first differencing.
diff_ts_data <- diff(ts_data)
adf_test_diff <- adf.test(diff_ts_data, alternative = "stationary")
print(adf_test_diff)##
## Augmented Dickey-Fuller Test
##
## data: diff_ts_data
## Dickey-Fuller = -4.8192, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
P-value is now below 0.05, showing stationarity after differencing.
I’ll now go ahead and split the data into training and test sets and observe the performance of the models auto_arima, prophet, prophet_xgboost, glmnet, and auto_arima_boost.
splits <- time_series_split(
data_subset_oil,
assess = "12 months",
cumulative = TRUE
)
splits %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(date, total_oil)# FORECAST ----
# * AUTO ARIMA ----
model_arima <- arima_reg() %>%
set_engine("auto_arima") %>%
fit(total_oil ~ date, training(splits))
# * Prophet ----
model_prophet <- prophet_reg(
seasonality_yearly = FALSE,
) %>%
set_engine("prophet") %>%
fit(total_oil ~ date, training(splits))
# * Prophet XGBoost----
model_prophet_xg <- prophet_boost(
seasonality_yearly = FALSE
) %>%
set_engine("prophet_xgboost") %>%
fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
training(splits))
# * Machine Learning - GLM ----
library(glmnet)
model_glmnet <- linear_reg(penalty = 0.1
,mixture = 0.5) %>%
set_engine("glmnet") %>%
fit(
total_oil ~ month(date, label = TRUE)
+ as.numeric(date),
training(splits)
)
# Auto ARIMA with XGBoost
model_arima_boost <- arima_boost() |>
set_engine("auto_arima_xgboost") |>
fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
training(splits))model_tbl <- modeltime_table(
model_arima,
model_prophet,
model_prophet_xg,
model_glmnet,
model_arima_boost
)Calculate the accuracy of the models on the training set and the test set.
calib_tbl_train <- model_tbl %>%
modeltime_calibrate(training(splits))
calib_tbl <- model_tbl %>%
modeltime_calibrate(testing(splits))
calib_tbl_train %>% modeltime_accuracy()## # A tibble: 5 × 9
## .model_id .model_desc .type mae mape mase smape rmse rsq
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 ARIMA(0,1,1) Fitt… 908. 8.77 0.964 8.64 1212. 0.822
## 2 2 PROPHET Fitt… 2111. 20.1 2.24 18.8 2454. 0.258
## 3 3 PROPHET W/ XGBOOST ERRORS Fitt… 135. 1.38 0.143 1.36 182. 0.996
## 4 4 GLMNET Test 2093. 20.0 2.22 18.7 2441. 0.266
## 5 5 ARIMA(0,1,1) W/ XGBOOST E… Fitt… 266. 2.62 0.283 2.59 375. 0.983
## # A tibble: 5 × 9
## .model_id .model_desc .type mae mape mase smape rmse rsq
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 ARIMA(0,1,1) Test 693. 6.11 1.57 6.39 874. NA
## 2 2 PROPHET Test 2337. 21.1 5.30 23.9 2487. 0.685
## 3 3 PROPHET W/ XGBOOST ERRO… Test 1177. 10.4 2.67 11.2 1522. 0.839
## 4 4 GLMNET Test 2335. 21.0 5.30 23.9 2518. 0.417
## 5 5 ARIMA(0,1,1) W/ XGBOOST… Test 714. 6.36 1.62 6.61 933. 0.0503
Based on the summaries of the models between both the training set
and the test set, the model ARIMA (0,1,1) with XGBoost
seems to be performing the best out of the 4 other models after
reviewing the rmse and mae values
between both the training set and the test set.
Plot the residuals of the training set
residuals_timeplot_train <- model_tbl |>
modeltime_calibrate(training(splits)) |>
modeltime_residuals() |>
plot_modeltime_residuals(
.type = "timeplot",
.interactive = TRUE
)
residuals_timeplot_trainLooking at the residuals, the models Prophet with XGBoost, and ARIMA(0,1,1) with XGBoost seem to be capturing the overall trend and variability in the data reasonably well.
Plot the ACF of the residuals for the training set
residuals_acf_train <- model_tbl |>
modeltime_calibrate(training(splits)) |>
modeltime_residuals() |>
plot_modeltime_residuals(
.type = "acf",
.interactive = TRUE
)
residuals_acf_trainARIMA(0,1,1), Prophet with XBoost,
and ARIMA(0,1,1) with XGBoost show no significant
auto-correlation, although ARIMA (0,1,1) with XGBoost
crosses the bounds of the confidence interval slighlty more than
ARIMA (0,1,1) and Prophet with
XGBoost.
Display the forecast for the next 6 months using the 5 models that were tested, for comparison.
future_forecast_tbl <- calib_tbl %>%
modeltime_refit(data_subset_oil) %>%
modeltime_forecast(
h = "6 months",
actual_data = data_subset_oil
) %>%
plot_modeltime_forecast()
future_forecast_tblIn summary, I started my exploratory analysis by evaluating the long term trend of the oil production data and the benefit of having the full data during forecasting. I concluded that the data from 2010 and onwards seemed more suitable and applicable for forecasting the next 6 months.
Outliers were observed during the period between 2015 and 2016 that could’ve affected the modeling results moving forward. I went ahead and addressed the outliers using the IQR method for outlier detection.
In the forecasting phase, I utilized the following models ARIMA, ARIMA with XGBoost, Prophet, Prophet with XGBoost, and Elastic Net (GLMNET). Based on the results of the models and the distribution of the residuals, it appears that ARIMA with XGBoost performs the best based on the rmse and mae along with the residual plots of both the training and the test sets.
insert_well_forecast <- function(well) {
df <- cleaned_data |>
filter(well_name == well) |>
mutate(Months_of_Production = row_number()) |>
rename(total_oil = oil) |>
mutate(date = as.Date(date, format = "%Y-%m-%d"))
### Handling Outliers
Q1 <- quantile(df$total_oil, 0.25)
Q3 <- quantile(df$total_oil, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- which(df$total_oil < lower_bound | df$total_oil > upper_bound)
median_value <- median(df$total_oil, na.rm = TRUE)
df$total_oil[outliers] <- median_value
split_index <- round(max(df$Months_of_Production) * 0.2)
df <- df |>
select(date, total_oil, well_name)
splits <- time_series_split(
df,
assess = split_index,
cumulative = TRUE
)
# FORECAST ----
# * Prophet ----
model_prophet <- prophet_reg(
seasonality_yearly = FALSE
) %>%
set_engine("prophet") %>%
fit(total_oil ~ date, training(splits))
# * Prophet XGBoost----
model_prophet_xg <- prophet_boost(
seasonality_yearly = FALSE
) %>%
set_engine("prophet_xgboost") %>%
fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
training(splits))
# * AUTO ARIMA ----
model_arima <- arima_reg() %>%
set_engine("auto_arima") %>%
fit(total_oil ~ date, training(splits))
# STLM ETS
model_arima_boost <- arima_boost() |>
set_engine("auto_arima_xgboost") |>
fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
training(splits))
model_arima_boost
# MODELTIME COMPARE ----
# * Modeltime Table ----
model_tbl <- modeltime_table(
model_arima_boost
)
# * Calibrate ----
calib_tbl <- model_tbl %>%
modeltime_calibrate(testing(splits))
# * Accuracy ----
model_accuracy <- calib_tbl %>% modeltime_accuracy()
model_accuracy
future_forecast_tbl <- calib_tbl %>%
modeltime_refit(df) %>%
modeltime_forecast(
h = "6 months",
actual_data = df
)
plot_data <- future_forecast_tbl |>
select(.model_desc, .key, .index, .value, .conf_lo, .conf_hi) |>
rename(model = .model_desc,
key = .key,
date = .index,
value = .value,
conf_lo = .conf_lo,
conf_hi = .conf_hi) |>
mutate(Months_of_Production = row_number())
display_forecast <- ggplot(plot_data, aes(x = Months_of_Production, y = value, color = key)) +
geom_line() +
geom_ribbon(aes(ymin = conf_lo, ymax = conf_hi), fill = "blue", alpha = 0.2) +
labs(title = paste0("Projected Oil Production Growth (",well,") from",
year(min(plot_data$date)),"-", year(max(plot_data$date))), x = "Date", y = "Value") +
scale_x_continuous(breaks = seq(1, max(plot_data$Months_of_Production), by = 12)) +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(y = "Barrels/d",
x = "Months")
display_forecast
return(display_forecast)
}forecasted_viz_11A <- insert_well_forecast("FIELD11A")
forecasted_viz_58 <- insert_well_forecast("FIELD58")
forecasted_viz_11A